Louisville’s Nonprofit Landscape

An Overview

This document walks through some of our findings as well as the steps I took to get there. If you’re feeling very adventurous, you can click on the “code” buttons to the right for a tutorial on how to do my job, or you can leave them be and just pre
### Gathering the data

We start by figuring out which organizations are in Louisville. We’ll use data from the IRS Business Master File (downloaded 6/9/2022). This document contains a full listing of tax-exempt organizations across the county, as well as some basic financial info (revenue, expenses, and assets).

To subset the data to the Louisville area, we start by filtering the data to organizations with a zip code within the Louisville MSA. Zip codes cross county and MSA boundaries, so some of these organizations lie outside the MSA, and we’ll need to remove them.

About 15% of organizations have a PO box as their primary address. We include them in our list if more than 50% of their zip code is within the Louisville MSA. (We calculate the percent of the zip code in the Louisville MSA as the percent of the businesses in that zip code that are in the Louisville MSA.) For those in the Louisville MSA, we assign them to the county which contains the largest amount of that zip code.

The remaining 85% of businesses have a street address. For these organizations, we find the organization’s longitude and latitude using the Census Bureau’s address geocoding service, and we fill in missing values using the private service Geocodio. We then compare their latitude and longitude to a map of counties in the Louisville are to decide whether they are in the Louisville MSA and to decide which county they are located in.

While we’re working with our map data, we also assign nonprofits to neighborhoods and Metro Council Districts. As mentioned before, zip codes are included in the original dataset.

The map below shows the original list, as defined by zip codes. We remove the organizations represented by the blue dots, keeping only those in grey within the Louisville MSA boundaries. You can mouse over the list to see the organizations’ names.

# Read in IRS Business Master files and combine into one data frame
bmf1 <- read_csv("eo1.csv", col_types = "cccccccncnncnncnnnnnnnnnnncc")
bmf2 <- read_csv("eo2.csv", col_types = "cccccccncnncnncnnnnnnnnnnncc")
bmf3 <- read_csv("eo3.csv", col_types = "cccccccncnncnncnnnnnnnnnnncc")
bmf4 <- read_csv("eo4.csv", col_types = "cccccccncnncnncnnnnnnnnnnncc")

bmf <- bind_rows(bmf1, bmf2, bmf3, bmf4)

rm(bmf1, bmf2, bmf3, bmf4)

ntee_categories <- read_csv("NTEE categories.csv", col_types = "ccc")
ntee_codes <- read_csv("NTEE codes.csv", col_types = "ccc")
subsection_codes <- read_csv("Subsection codes.csv", col_types = "ccc")
ntee_to_cnpe <- read_csv("CNPE to NTEE.csv", col_types = "ccc")

# https://github.com/Nonprofit-Open-Data-Collective/irs-exempt-org-business-master-file#activity-codes
activity_to_ntee <- readxl::read_excel("Activity to NTEE.xlsx", sheet = 2)

ntee_categories %<>%
  rename(
    NTEE_cat = `NTEE Code`) %>%
  left_join(ntee_to_cnpe, by = c("NTEE_cat" = "NTEE Code", "Description" = "NTEE"))

ntee_codes %<>%
  rename(
    NTEE_code = `NTEE Code`)

activity_to_ntee %<>%
  transmute(
    ACTIVITY = str_pad(ACTIVITY, 3, "left", "0"),
    NTEE_replacement = if_else(!is.na(NTEE_replacement), NTEE_replacement, NTEE))
bmf %<>%
  mutate(
    EIN,
    name = NAME,
    street = STREET,
    city = CITY,
    state = STATE,
    zip = str_sub(ZIP, 1, 5),
    tax_year = str_sub(TAX_PERIOD, 1, 4) %>% as.numeric(),
    creation = RULING,
    filing_type = if_else(PF_FILING_REQ_CD == 1, "990PF",
                  if_else(FILING_REQ_CD %in% c(1, 3, 4), "990",
                  if_else(FILING_REQ_CD %in% c(2), "990N",
                  "other"))),
    revenue = REVENUE_AMT,
    assets = ASSET_AMT,
    activity_codes = ACTIVITY,
    NTEE_code = NTEE_CD,
    NTEE_cat = str_sub(NTEE_CD, 1, 1))
# MSA_zip_crosswalk <- glptools::MSA_zip %>%
#   filter(MSA == "31140") %>%
#   mutate(business_in_county = round(business_in_county, 1))

MSA_zip_crosswalk <- glptools::MSA_zip %>%
  filter(MSA == "31140")

bmf_lou <- bmf %>% 
  left_join(MSA_zip_crosswalk, by = "zip") %>%
  filter(!is.na(MSA)) %>%
  select(EIN, street, city, state, zip)

# bmf_sure <- bmf %>%
#   filter(business_in_county == 100) %>%
#   mutate(lou_msa = "sure")
# 
# bmf_check <- bmf %>%
#   filter(business_in_county > 0, business_in_county < 100) %>%
#   mutate(lou_msa = "unsure")
# 
# output <- bind_rows(bmf_sure, bmf_check)

#write_csv(output, "Louisville Nonprofits 6_9_2022.csv")

# Remove PO Boxes as they will not be addressed
bmf_lou_po <- bmf_lou %>%
  filter(str_detect(street, "PO BOX|PO B0X"))

bmf_lou_remaining <- bmf_lou %>%
  anti_join(bmf_lou_po, by = c("EIN", "street", "city", "state", "zip"))

bmf_lou_po %<>% 
  left_join(MSA_zip_crosswalk, by = "zip") %>%
  filter(business_in_county >= 50) %>%
  select(EIN, street, city, state, zip)

# Geocode using the Census Bureau geocoder
bmf_lou_census <- bmf_lou_remaining %>%
  geocode(
    street = street,
    city = city,
    state = state,
    postalcode = zip,
    method = "census")

bmf_lou_remaining <- bmf_lou_census %>%
  filter(is.na(lat)) %>%
  select(-lat, -long)

bmf_lou_census %<>%
  filter(!is.na(lat))

# Geocode using the geocodio geocoder

Sys.setenv(GEOCODIO_API_KEY = "cccff3c3cc3aca633fc09ccc3901c1a861a9069")

bmf_lou_geocodio <- bmf_lou_remaining %>%
  geocode(
    street = street,
    city = city,
    state = state,
    postalcode = zip,
    method = "geocodio")

# Bind geocoded information and save
bmf_lou_geocoded <- bind_rows(bmf_lou_census, bmf_lou_geocodio, bmf_lou_po)

bmf_lou_geocoded %<>%
  select(EIN, lat, long)

save(bmf_lou_geocoded, file = "bmf_lou_geocoded.RData")
load("bmf_lou_geocoded.RData")

# For PO boxes, assign EINS to the county that contains the majority of the zip code
bmf_lou_POs <- bmf_lou_geocoded %>%
  filter(is.na(lat)) %>%
  left_join(bmf, by = "EIN") %>%
  select(EIN, zip) %>%
  left_join(FIPS_zip_full_MSA, by = "zip") %>%
  group_by(EIN) %>%
  arrange(desc(business_in_county)) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(EIN, FIPS)
  
# For other addresses, create sf object and join with an MSA and neighborhood map
bmf_lou_addressed <- bmf_lou_geocoded %>%
  filter(!is.na(lat)) %>%
  st_as_sf(coords = c("long", "lat"), crs = 4326)

bmf_lou_county <- bmf_lou_addressed %>%
  st_join(map_msa_lou, st_within)

bmf_lou_nh <- bmf_lou_addressed %>%
  st_join(map_nh, st_within) %>%
  st_drop_geometry()

bmf_lou_district <- bmf_lou_addressed %>%
  st_join(map_district, st_within) %>%
  st_drop_geometry()
pal <- colorFactor(palette = c("#00a9b7FF", "#333333"),
                   domain = c("Keep", "Drop")) 

labels <- sprintf("%s",
                  left_join(bmf_lou_county, bmf, by = "EIN")$name) %>%
        lapply(htmltools::HTML)

leaflet() %>%
  addPolygons(data = map_msa_lou, 
              color = "black",
              opacity = 1,
              weight = 3,
              fillOpacity = 0) %>%
  addCircleMarkers(data = mutate(bmf_lou_county, keep = if_else(is.na(FIPS), "Drop", "Keep")),
                   radius = 2,
                   color = ~pal(keep),
                   label = labels,
      labelOptions = labelOptions(
        style = list("font-weight" = "normal", "font-family" = "Arial", padding = "3px 8px"),
        textsize = "15px",
        direction = "auto")) %>%
  addLegend(pal = pal, values = c("Drop", "Keep")) %>%
  addTiles() %>%
  setView(-85.6, 38.2, zoom = 8)
rm(pal)
# Filter to EINs in Lou MSA and remove county polygons from the data frame
bmf_lou_county %<>%
  st_drop_geometry() %>%
  filter(!is.na(FIPS)) %>%
  select(-county)

# Add neighborhood names to the data frame
bmf_lou_county %<>%
  left_join(bmf_lou_nh, by = "EIN") %>%
  left_join(bmf_lou_district, by = "EIN")

# Add PO box data 
bmf_lou_county %<>%
  bind_rows(bmf_lou_POs)

bmf_lou <- bmf_lou_county

bmf_lou_points <- bmf_lou_addressed %>%
  filter(EIN %in% bmf_lou$EIN)

rm(bmf_lou_geocoded, bmf_lou_county, bmf_lou_nh, bmf_lou_district, bmf_lou_POs, bmf_lou_addressed)
filing_data <- any_time("org_indices", starting_year = 2020, col_types = "cccccccccn")

filing_data %<>%
  filter(EIN %in% bmf_lou$EIN) %>%
  mutate(OBJECT_ID = str_remove(OBJECT_ID, ","))

# Create list of XML files
files_df <- data.frame(
  folder = character(),
  file = character())

for(i in 1:8) {
  
  this_folder <- "xml_files/download990xml_2020_" %p% i
  
  these_files <- list.files(this_folder)
  
  temp <- data.frame(folder = this_folder,
                     file = these_files)
  
  files_df %<>% bind_rows(temp)
}

for(i in 1:6) {
  
  this_folder <- "xml_files/download990xml_2021_" %p% i
  
  these_files <- list.files(this_folder)
  
  temp <- data.frame(folder = this_folder,
                     file = these_files)
  
  files_df %<>% bind_rows(temp)
}

for(i in 1:1) {
  
  this_folder <- "xml_files/download990xml_2022"# %p% i
  
  these_files <- list.files(this_folder)
  
  temp <- data.frame(folder = this_folder,
                     file = these_files)
  
  files_df %<>% bind_rows(temp)
}

files_df %<>%
  mutate(file_name = str_remove(file, "_public\\.xml"),
         path = paste0(folder, "/", file)) %>%
  select(file_name, path)

filing_data %<>% left_join(files_df, by = c("OBJECT_ID" = "file_name"))

# Pull XML files into a large list
output <- list()

for (i in 1:nrow(filing_data)){
  xml_obj <- tryCatch({
   XML::xmlParse(filing_data$path[i])
  },
  error = function(e) list("missing"))
  
  output <- c(output, xml_obj)
  
  print(i)
}


#test <- read_xmls("xml_files/download990xml_2022")

# Function to get certain data points from these columns
safe_get <- function(obj, cols) {
  
  for (c in cols) {
    
    these_cols <- names(obj) == c
    
    if (c == "CompensationAmt") {
      
      compensations <- flatten(obj)[names(flatten(obj)) == "CompensationAmt"]
      
      jobs <- as.character(sum(compensations != "0"))
      
      return(jobs)
    }
    
    if (sum(these_cols) == 1) {
      obj <- obj[[c]] 
    } else {
      obj <- obj[names(obj) == c]
      
      obj <- map(obj, function(x) flatten(x))
      
    }
  }
  
  if(length(obj) == 0) return(NA_character_)
  
  obj
  
}

extract_fxn <- function(xml_obj) {
  
  if (is.character(xml_obj)) return(data.frame())
  
  df <- xml_obj %>%
      xmlToList()
  
  all_data <- df
  
  this_EIN = safe_get(df[["ReturnHeader"]][["Filer"]], "EIN")
  this_return_type = safe_get(df[["ReturnHeader"]], "ReturnTypeCd")
  this_return_period = safe_get(df[["ReturnHeader"]], "TaxPeriodEndDt")
  
  if (this_return_type == "990") {
    
    df <- df[["ReturnData"]][["IRS990"]]
    
    output_df <- data.frame(
      
      EIN = this_EIN,
      return_type = this_return_type,
      return_period = this_return_period,
      
      PYContributionsGrantsAmt = safe_get(df, "PYContributionsGrantsAmt"),
      CYContributionsGrantsAmt = safe_get(df, "CYContributionsGrantsAmt"),
      
      PYProgramServiceRevenueAmt = safe_get(df, "PYProgramServiceRevenueAmt"),
      CYProgramServiceRevenueAmt = safe_get(df, "CYProgramServiceRevenueAmt"),
      
      PYInvestmentIncomeAmt = safe_get(df, "PYInvestmentIncomeAmt"),
      CYInvestmentIncomeAmt = safe_get(df, "CYInvestmentIncomeAmt"),
      
      PYOtherRevenueAmt = safe_get(df, "PYOtherRevenueAmt"),
      CYOtherRevenueAmt = safe_get(df, "CYOtherRevenueAmt"),
      
      PYRevenuesLessExpensesAmt = safe_get(df, "PYRevenuesLessExpensesAmt"),
      CYRevenuesLessExpensesAmt = safe_get(df, "CYRevenuesLessExpensesAmt"),
      
      PYSalariesCompEmpBnftPaidAmt = safe_get(df, "PYSalariesCompEmpBnftPaidAmt"),
      CYSalariesCompEmpBnftPaidAmt = safe_get(df, "CYSalariesCompEmpBnftPaidAmt"),
      
      PYTotalExpensesAmt = safe_get(df, "PYTotalExpensesAmt"),
      CYTotalExpensesAmt = safe_get(df, "CYTotalExpensesAmt"),
      
      MissionDesc = safe_get(df, "MissionDesc"),
      
      WebsiteAddressTxt = safe_get(df, "WebsiteAddressTxt"),
      
      employees = safe_get(df, "EmployeeCnt"),
      
      
      FederatedCampaignsAmt = safe_get(df, "FederatedCampaignsAmt"),
      MembershipDuesAmt = safe_get(df, "MembershipDuesAmt"),
      FundraisingAmt = safe_get(df, "FundraisingAmt"),
      RelatedOrganizationsAmt = safe_get(df, "RelatedOrganizationsAmt"),
      GovernmentGrantsAmt = safe_get(df, "GovernmentGrantsAmt"),
      AllOtherContributionsAmt = safe_get(df, "AllOtherContributionsAmt"),
      NonCashContributionsAmt = safe_get(df, "NonCashContributionsAmt"),
      TotalContributionsAmt = safe_get(df, "TotalContributionsAmt"),
      
      GrantsToDomesticOrgsGrp = safe_get(df, c("GrantsToDomesticOrgsGrp", "TotalAmt")),
      GrantsToDomesticIndividualsGrp = safe_get(df, c("GrantsToDomesticIndividualsGrp", "TotalAmt")),
      CompCurrentOfcrDirectorsGrp = safe_get(df, c("CompCurrentOfcrDirectorsGrp", "TotalAmt")),
      OtherSalariesAndWagesGrp = safe_get(df, c("OtherSalariesAndWagesGrp", "TotalAmt")),
      PayrollTaxesGrp = safe_get(df, c("PayrollTaxesGrp", "TotalAmt")),
      FeesForServicesAccountingGrp = safe_get(df, c("FeesForServicesAccountingGrp", "TotalAmt")),
      AdvertisingGrp = safe_get(df, c("AdvertisingGrp", "TotalAmt")),
      OfficeExpensesGrp = safe_get(df, c("OfficeExpensesGrp", "TotalAmt")),
      InformationTechnologyGrp = safe_get(df, c("InformationTechnologyGrp", "TotalAmt")),
      ConferencesMeetingsGrp = safe_get(df, c("ConferencesMeetingsGrp", "TotalAmt")),
      InsuranceGrp = safe_get(df, c("InsuranceGrp", "TotalAmt")),
      #OtherExpensesGrp = safe_get(df, c("GrantsToDomesticOrgsGrp", "TotalAmt")),
      TotalFunctionalExpensesGrp = safe_get(df, c("TotalFunctionalExpensesGrp", "TotalAmt")))
  } else if (this_return_type == "990EZ") {
    
    df <- df[["ReturnData"]][["IRS990EZ"]]
     
    output_df <- data.frame(
      
      EIN = this_EIN,
      return_type = this_return_type,
      return_period = this_return_period,
      
      CYContributionsGrantsAmt = safe_get(df, "ContributionsGiftsGrantsEtcAmt"),
      
      CYProgramServiceRevenueAmt = safe_get(df, "ProgramServiceRevenueAmt"),
      
      PYRevenuesLessExpensesAmt = safe_get(df, "PYRevenuesLessExpensesAmt"),
      CYRevenuesLessExpensesAmt = safe_get(df, "CYRevenuesLessExpensesAmt"),
      
      CYSalariesCompEmpBnftPaidAmt = safe_get(df, "SalariesOtherCompEmplBnftAmt"),
      
      CYTotalExpensesAmt = safe_get(df, "TotalExpensesAmt"),
      
      GrantsAndSimilarAmountsPaidAmt = safe_get(df, "GrantsAndSimilarAmountsPaidAmt"),
      
      TotalExpensesAmt = safe_get(df, "GrantsAndSimilarAmountsPaidAmt"),
      
      MissionDesc = safe_get(df, "MissionDesc"),
      
      WebsiteAddressTxt = safe_get(df, "WebsiteAddressTxt"),
      
      employees = safe_get(df, c("OfficerDirectorTrusteeEmplGrp", "CompensationAmt")))
  } else if (this_return_type %in% c("990PF", "990T")) {
    
     df <- df[["ReturnData"]][["IRS990PF"]]
    
    output_df <- data.frame(
      
      EIN = this_EIN,
      return_type = this_return_type,
      return_period = this_return_period)
  }
  
  return(output_df)
  
}

# extract_layout <- function(xml_obj) {
#   
#   if (is.character(xml_obj)) return(data.frame())
#   
#   df <- xml_obj %>%
#       xmlToList()
#   
#   df
# }
# 
# output_layout <- map(output, extract_layout)
# 
# save(output_layout, file = "output_layout.RData")


library(tictoc)

tic()
output_internal_data <- map_dfr(output, extract_fxn)
toc()

#save(output_internal_data, file = "output_internal_data2.RData")

load("output_layout.RData")
load("output_internal_data2.RData")

test = output_internal_data %>%
  mutate(across(PYContributionsGrantsAmt:CYSalariesCompEmpBnftPaidAmt, as.numeric)) %>%
  mutate(across(employees:TotalFunctionalExpensesGrp, as.numeric)) %>%
  mutate(across(FederatedCampaignsAmt:TotalContributionsAmt, ~replace_na(., 0))) %>%
  mutate(revenue_check = TotalContributionsAmt - (FederatedCampaignsAmt + MembershipDuesAmt + FundraisingAmt + 
           RelatedOrganizationsAmt + GovernmentGrantsAmt + AllOtherContributionsAmt + NonCashContributionsAmt) == 0)
load("output_internal_data2.RData")
df_990_2021 <- read_csv("21eoextract990.csv")
df_990ez_2021 <- read_csv("21eoextractez.csv")

test <- df_990_2021 %>%
  transmute(
    EIN,
    tax_period = tax_pd,
    employees = noemplyeesw3cnt)


df_990_2020 <- readxl::read_excel("20eoextract990.xlsx", progress = TRUE)
df_990ez_2021 <- readxl::read_excel("21eoextractez.xlsx", progress = TRUE)
df_990ez_2020 <- readxl::read_excel("20eoextractez.xlsx", progress = TRUE)

Classifying organizations

Deciding which organizations fit where is complicated, to say the least. We have a couple ways of approaching it:

  • IRS subsection - Are you a 501(c)(3), a 501(c)(4), etc.?
  • Activity and NTEE codes - Application for nonprofit status ask organizations to classify their work. Until the mid-1990s, the IRS used a list of 266 “activity codes.” Organizations could list 1-3 activity codes. Beginning in the mid-1990s, the IRS began using the NTEE classification system, a revised set of categories with more uniformity and consistency. NTEE codes are hierarchical, with 26 top-level groups (e.g. Education, Health Care, Housing & Shelter), 211 sub-categories (e.g. PreK-12 Education, Hospitals, Housing Creation and Rehabilitation), and 655 lower-level groups (e.g. Preschools, Senior Citizens Housing & Retirement Communities, Specialty Hospitals). I refer to the 26 top-level groups as NTEE categories, the 211 middle-level groups as NTEE sub-categories, and the 655 specific groups as NTEE codes.
    • NTEE codes are composed of a letter and a two-digit number, like (e.g. B21). The letter indicates the NTEE category, the first digit indicates the sub-category, and the second digit indicates the code. For example, B represents education. Some codes within education are:
      • The B10s: organizations that fundraiser for educational organizations. B11 includes nonprofits that support a single organization, while B12 includes nonprofits that support multiple organizations.
      • The B20s are prek-grade 12 schools. (E.g. B21, B24, B25, B28, B29.)
      • B30 is vocational schools.
      • The B40s are colleges and universities of all kinds.
      • Etc.
    • Many codes are common across categories. For example, B11 represents Single Organization Support for educational nonprofits (the B category), while C11 represents Single Organization Support for environmental nonprofits (the C category).
    • Going from activity codes to NTEE codes is possible, but not precise. While we can generally connect activity codes to top-level NTEE codes, activity codes often relate to more than one specific NTEE code.)
  • Foundation classification - by default, all 501(c)(3)s are classified as “private foundations” unless they demonstrate that they are not, which makes them a “public charity”.
bmf_lou %<>%
  left_join(bmf, by = "EIN")

# Change AMVETS from subsection 91 to 19.
bmf_lou$SUBSECTION[bmf_lou$name == "AMVETS"] <- "19"

# Change Thornton to employee benefit trust
bmf_lou$SUBSECTION[bmf_lou$EIN == "611040959"] <- "09"

# Classify 501(c)(3) organizations as foundations, Churches/Religious orgs, Government entities, or charitable nonprofits
bmf_lou %<>%
  left_join(subsection_codes, by = "SUBSECTION") %>%
  mutate(
    org_type = case_when(
      SUBSECTION != "03" ~ `short name`,
      PF_FILING_REQ_CD == 1 ~ "Foundation",
      FILING_REQ_CD %in% c(6, 13) ~ "Church or Religious Organization",
      FILING_REQ_CD == 14 ~ "Government Entities",
      # other FILING_REQ_CD: 1, 2, 3
      FILING_REQ_CD %in% 1:3 ~ "Charitable Nonprofit",
      FOUNDATION %in% c(10) ~ "Church or Religious Organization",
      FOUNDATION %in% c(4, 11, 16, 17) ~ "Foundation",
      FOUNDATION %in% c(15) ~ "Government Entities"))

# Replace missing NTEE codes with an activity code to NTEE crosswalk
bmf_lou %<>%
  mutate(activity_code_1 = str_sub(ACTIVITY, 1, 3)) %>%
  left_join(activity_to_ntee, by = c("activity_code_1" = "ACTIVITY")) %>%
  mutate(
    NTEE_code = if_else(is.na(NTEE_code), NTEE_replacement, NTEE_code),
    NTEE_cat = str_sub(NTEE_code, 1, 1))

# Add NTEE1 and NTEE2
bmf_lou %<>%
  left_join(ntee_categories, by = "NTEE_cat") %>%
  mutate(NTEE1 = Description) %>%
  select(-Description, -Definition) %>%
  left_join(ntee_codes, by = "NTEE_code") %>%
  mutate(NTEE2 = Description)%>%
  select(-Description, -Definition) %>%
  mutate(NTEE2 = if_else(!is.na(NTEE1) & is.na(NTEE2), "uncategorized", NTEE2))
library(sunburstR)
library(d3r)

df <- bmf_lou_classified %>%
  select(org_type, NTEE1, NTEE2, name, revenue) %>%
  mutate(
    org_type = str_replace(org_type, "-", " "),
    NTEE1 = str_replace(NTEE1, "-", " "),
    NTEE2 = str_replace(NTEE2, "-", " "),
    name = str_replace(name, "-", " "),
    revenue = if_else(revenue < 0, 0, revenue)) %>%
  mutate(org_type = replace_na(org_type, "uncategorized"),
         NTEE1 = replace_na(NTEE1, "uncategorized")) %>%
  count(org_type, NTEE1, NTEE2, name) %>%
  rename(size = n)

tree <- d3_nest(df, value_cols = "size")

sunburst(tree, width="100%", height=600, legend = FALSE)

sund2b(tree, width="100%")
org_type1 <- bmf_lou_classified %>%
  group_by(org_type) %>%
  summarize(n = n(), .groups = "drop")

org_type2 <- bmf_lou_classified %>%
  filter(org_type == "Charitable Nonprofit") %>%
  group_by(NTEE1) %>%
  summarize(n = n(), .groups = "drop")

org_type3 <- bmf_lou_classified %>%
  filter(org_type == "Charitable Nonprofit") %>%
  group_by(NTEE2) %>%
  summarize(n = n(), .groups = "drop") %>%
  mutate(NTEE_cat = str_sub(NTEE2, 1, 1)) %>%
  left_join(ntee_categories, by = "NTEE_cat") 


org_type1 <- bmf_lou_classified %>%
  group_by(org_type) %>%
  summarize(n = sum(revenue, na.rm = TRUE), .groups = "drop")

org_type2 <- bmf_lou_classified %>%
  filter(org_type == "Charitable Nonprofit") %>%
  group_by(NTEE1) %>%
  summarize(n = sum(revenue, na.rm = TRUE), .groups = "drop")

org_type3 <- bmf_lou_classified %>%
  filter(org_type == "Charitable Nonprofit") %>%
  group_by(NTEE2) %>%
  summarize(n = sum(revenue, na.rm = TRUE), .groups = "drop")

Subsection codes

Our first way of separating out types of nonprofits is to look at thir subsection codes. We are primarily concerned with 501(c)(3)s, or charitable nonprofits, and all information from here on out will focus on those.

bmf_lou %>%
  group_by(SUBSECTION, `short name`) %>%
  count() %>%
  ungroup() %>%
  gt()
SUBSECTION short name n
01 Credit Unions 4
02 Title-holding Organization 10
03 Charitable Nonprofits 5406
04 Civic Associations 259
05 Labor and Agricultural Organizations 114
06 Business Associations 235
07 Social Clubs and Greek Organizations 248
08 Fraternal Societies 184
09 Employee Benefit Trusts 15
10 Fraternal Societies 53
12 Utilities 16
13 Cemeteries 54
14 Credit Unions 7
15 Mutual Insurance 1
19 Veteran Support Organizations 82
92 Trusts 11

Foundation Codes

501(c)(3)s are further grouped by foundation codes. The biggest distinction this creates is between private foundations and publicly-supported charities. At this stage, we classify organizations into the following groups: * Foundations, * Churches * Schools * Hospital and Medical Research Organizations, * Government Entities * Public Charities

bmf_lou %<>%
  mutate(
    org_type = case_when(
      SUBSECTION != "03" ~ `short name`,
      FOUNDATION %in% c(2:4, 21) ~ "Foundation",
      FOUNDATION == 10 ~ "Church or Religious Organization",
      FOUNDATION %in% c(11, 13, 23) ~ "School",
      FOUNDATION %in% c(12, 22, 24) ~ "Hospital or Medical Research Org",
      FOUNDATION == 14 ~ "Government Entity",
      FOUNDATION %in% 15:17 ~ "Public charity"))
      # other FILING_REQ_CD: 1, 2, 3
      # FILING_REQ_CD == 13 ~ "Church or Religious Organization",
      # FILING_REQ_CD == 14 ~ "Government Entities"))

bmf_lou %>%
  filter(SUBSECTION == "03") %>%
  group_by(org_type) %>%
  count() %>%
  ungroup() %>%
  gt() %>%
  apply_table_settings()
org_type n
Church or Religious Organization 943
Foundation 458
Government Entity 2
Hospital or Medical Research Org 51
Public charity 3872
School 80

NTEE Codes

The next grouping we can create are NTEE Codes.

output_internal_data %<>%
  group_by(EIN) %>%
  arrange(desc(return_period)) %>%
  filter(row_number() == 1) %>%
  ungroup()

output_internal_data %<>%
  mutate(across(PYContributionsGrantsAmt:CYSalariesCompEmpBnftPaidAmt, as.numeric)) %>%
  mutate(across(employees:TotalFunctionalExpensesGrp, as.numeric)) %>%
  mutate(across(FederatedCampaignsAmt:TotalContributionsAmt, ~replace_na(., 0)))
  # mutate(revenue_check = TotalContributionsAmt - (FederatedCampaignsAmt + MembershipDuesAmt + FundraisingAmt + 
  #          RelatedOrganizationsAmt + GovernmentGrantsAmt + AllOtherContributionsAmt + NonCashContributionsAmt) == 0)

bmf_lou %<>%
  left_join(output_internal_data, by = "EIN")
# 
# test <- bmf_lou %>%
#   select(EIN, SUBSECTION, org_type, revenue) %>%
#   filter(SUBSECTION == "03", org_type == "Public charity") %>%
#   left_join(output_internal_data, by = "EIN")
# 
# test2 <- bmf_lou %>%
#   anti_join(test, by = "EIN") %>%
#   left_join(output_internal_data, by = "EIN") %>%
#   filter(EIN %in% output_internal_data$EIN)
# 
# test_990 <- test %>%
#   filter(EIN %in% output_internal_data$EIN)
# 
# test_no_990 <- test %>%
#   filter(EIN %not_in% output_internal_data$EIN)
ntee_df <- bmf_lou %>%
  filter(SUBSECTION == "03", org_type == "Public charity") %>%
  group_by(CNPE) %>%
  summarize(
    Nonprofits = n(),
    Revenue = sum(revenue, na.rm = T),
    Employees = sum(employees, na.rm = TRUE),
    Payroll = sum(CYSalariesCompEmpBnftPaidAmt, na.rm = TRUE)) %>%
  ungroup()

ntee_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by CNPE Category",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)
ntee_df %>%
  gt() %>%
  fmt_integer(columns = c(Nonprofits, Employees)) %>%
  fmt_currency(columns = c(Revenue, Payroll), suffix = T) %>%
  apply_table_settings()
CNPE Nonprofits Revenue Employees Payroll
Animal-Related 114 $16.97M 229 $5.16M
Arts, Culture & Humanities 371 $105.25M 1,893 $56.32M
Civil Rights, Social Action, Advocacy 34 $15.83M 237 $9.50M
Community Improvement, Capacity Building 155 $30.58M 78 $3.22M
Education 676 $591.23M 963 $23.26M
Employment, Job Related 24 $87.94M 3,322 $43.80M
Environmental Quality, Protection and Beautification 74 $17.51M 155 $4.66M
Food, Agriculture, and Nutrition 45 $57.64M 96 $4.62M
Health Services 320 $1.77B 15,499 $1.11B
Human Services 952 $756.84M 15,118 $372.01M
International, Foreign Affairs, and National Security 51 $9.20M 56 $2.62M
Mutual/Membership Benefit 10 $7.42M 0 $0.00
Public, Society Benefit 55 $12.53M 40 $2.61M
Recreation, Sports, Leisure, Athletics 290 $202.41M 918 $56.80M
Religion Related, Spiritual Development 279 $69.47M 588 $18.14M
Research 15 $1.23M 5 $588.42K
Unknown 78 $7.56M 165 $2.97M
Volunteerism 150 $287.29M 615 $16.19M
NA 179 $52.96M 535 $13.63M

Who’s in Lou?

Download all data here

bmf_lou %>%
  downloadthis::download_this(
    output_name = "Business Master File",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)

By County

Jefferson County has the largest number of nonprofits, though Harrison County happens to have the most per capita! You can mouse over the map below to see the numbers for each county. Click on the “Table” tab to see the data in a table format and to download the information.

Map

county_pop_df <- glpdata::population_msa_counties %>%
  filter(year == 2019,
         sex == "total", 
         race == "total")

county_df <- bmf_lou %>%
  group_by(FIPS) %>%
  count() %>%
  ungroup() %>%
  left_join(county_pop_df, by = "FIPS") %>%
  left_join(MSA_FIPS_info, by = "FIPS") %>%
  transmute(
    FIPS,
    County = county %p% if_else(str_sub(FIPS, 1, 2) == "18", 
                                ", IN", ", KY"),
    Nonprofits = n,
    `Nonprofits per 1000 Residents` = round(n / population * 1000, 1))

county_df_csi <- bmf_lou %>%
  group_by(FIPS) %>%
  filter(SUBSECTION == "03", org_type == "Public charity") %>%
  summarize(
    CSIs = n(),
    Revenue = sum(revenue, na.rm = T),
    Employees = sum(employees, na.rm = TRUE),
    Payroll = sum(CYSalariesCompEmpBnftPaidAmt, na.rm = TRUE))

county_df %<>%
  left_join(county_df_csi, by = c("FIPS")) %>%
  select(-FIPS)

county_df %<>%
  mutate(
    county = str_extract(County, "^.*(?= County)")) %>%
  left_join(map_msa_lou, by = "county")

pal <- leaflet::colorNumeric(
  palette = RColorBrewer::brewer.pal(9, "BuPu"),
  domain = range(county_df$`Nonprofits per 1000 Residents`))

labels <- sprintf("%s<br/>%s<br/>%s",
                  county_df$County,
                  "Nonprofit HQs per 1,000 residents: " %p% county_df$`Nonprofits per 1000 Residents`,
                  "Total Nonprofits: " %p% county_df$`Nonprofits`) %>%
        lapply(htmltools::HTML)

leaflet(st_as_sf(county_df)) %>%
  addPolygons(
      color = "#444444", weight = 1, 
      smoothFactor = 0.5, opacity = 1.0, fillOpacity = 0.5,
      fillColor = ~pal(`Nonprofits per 1000 Residents`),
      label = labels,
      labelOptions = labelOptions(
        style = list("font-weight" = "normal", "font-family" = "Arial", padding = "3px 8px"),
        textsize = "15px",
        direction = "auto"))

Table

county_df %<>%
  select(County, Nonprofits, `Nonprofits per 1000 Residents`, 
         `Charitable Social Impact Orgs` = CSIs, Revenue, Payroll, Employees)

county_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by County",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)
county_df %>%
  gt() %>%
  fmt_integer(columns = c(Nonprofits, `Charitable Social Impact Orgs`, Employees)) %>%
  fmt_currency(columns = c(Revenue, Payroll), suffix = T) %>%
  tab_header(title = "Nonprofits by county",
             subtitle = "") %>%
  tab_spanner(
    label = "Nonprofits",
    columns = vars(Nonprofits, `Nonprofits per 1000 Residents`)) %>%
  tab_spanner(
    label = "Charitable Social Impact Organizations",
    columns = vars(`Charitable Social Impact Orgs`, Revenue, Employees, Payroll)) %>%
  cols_label(
    Nonprofits = "Number",
    `Nonprofits per 1000 Residents` = "Number per 1000 Residents", 
    `Charitable Social Impact Orgs` = "Number",
    Revenue = "Total Revenue") %>%
  apply_table_settings()
Nonprofits by county
County Nonprofits Charitable Social Impact Organizations
Number Number per 1000 Residents Number Total Revenue Employees Payroll
Clark County, IN 485 4.1 247 $113.47M 1,455 $57.26M
Floyd County, IN 373 4.8 218 $64.65M 510 $15.11M
Harrison County, IN 239 6.0 135 $21.81M 189 $2.11M
Washington County, IN 139 5.0 71 $4.67M 68 $762.35K
Bullitt County, KY 243 3.0 135 $37.42M 684 $18.94M
Henry County, KY 63 4.0 38 $3.85M 12 $387.39K
Jefferson County, KY 4,554 5.9 2,678 $3.81B 37,061 $1.64B
Oldham County, KY 310 4.7 186 $24.90M 365 $9.42M
Shelby County, KY 216 4.6 123 $10.18M 166 $2.58M
Spencer County, KY 77 4.2 41 $510.46K 2 $89.87K

By Metro Council District

Unsurprisingly (to me, at least) Downtown has the largest concentration of nonprofit headquarters. Since Districts have approximately the same number of residents and I didn’t have the data on hand, I did not include per capita figures.

Map

council_df <- bmf_lou %>%
  group_by(district) %>%
  count() %>%
  ungroup() %>%
  filter(!is.na(district)) %>%
  transmute(
    `Council District` = district,
    Nonprofits = n)

council_df %<>%
  left_join(map_district, by = c("Council District" = "district"))

council_df_csi <- bmf_lou %>%
  group_by(district) %>%
  filter(SUBSECTION == "03", org_type == "Public charity") %>%
  summarize(
    CSIs = n(),
    Revenue = sum(revenue, na.rm = T),
    Employees = sum(employees, na.rm = TRUE),
    Payroll = sum(CYSalariesCompEmpBnftPaidAmt, na.rm = TRUE)) %>%
  filter(!is.na(district))

council_df %<>%
  left_join(council_df_csi, by = c("Council District" = "district"))

pal <- leaflet::colorNumeric(
  palette = RColorBrewer::brewer.pal(9, "BuPu"),
  domain = range(council_df$Nonprofits))

labels <- sprintf("%s<br/>%s",
                  "District " %p% council_df$`Council District`,
                  "Nonprofit HQs: " %p% council_df$Nonprofits) %>%
        lapply(htmltools::HTML)

leaflet(st_as_sf(council_df)) %>%
  addPolygons(
      color = "#444444", weight = 1, 
      smoothFactor = 0.5, opacity = 1.0, fillOpacity = 0.5,
      fillColor = ~pal(Nonprofits),
      label = labels,
      labelOptions = labelOptions(
        style = list("font-weight" = "normal", "font-family" = "Arial", padding = "3px 8px"),
        textsize = "15px",
        direction = "auto"))

Table

council_df %<>%
  select(`Council District`, Nonprofits,  
         `Charitable Social Impact Orgs` = CSIs, Revenue, Payroll, Employees)

council_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by Metro Council District",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)
council_df %>%
  gt() %>%
  fmt_integer(columns = c(Nonprofits)) %>%
  tab_header(title = "Nonprofits by Council District",
             subtitle = "") %>%
  fmt_integer(columns = c(Nonprofits, `Charitable Social Impact Orgs`, Employees)) %>%
  fmt_currency(columns = c(Revenue, Payroll), suffix = T) %>%
  tab_spanner(
    label = "Nonprofits",
    columns = vars(Nonprofits)) %>%
  tab_spanner(
    label = "Charitable Social Impact Organizations",
    columns = vars(`Charitable Social Impact Orgs`, Revenue, Employees, Payroll)) %>%
  cols_label(
    Nonprofits = "Number",
    `Charitable Social Impact Orgs` = "Number",
    Revenue = "Total Revenue") %>%
  apply_table_settings()
Nonprofits by Council District
Council District Nonprofits Charitable Social Impact Organizations
Number Number Total Revenue Employees Payroll
1 104 62 $790.97K 5 $68.37K
2 76 35 $1.83M 17 $379.02K
3 95 55 $97.66M 1,574 $49.36M
4 623 426 $1.46B 14,747 $869.20M
5 133 92 $28.78M 822 $12.84M
6 207 139 $793.46M 4,440 $118.48M
7 261 87 $16.72M 266 $6.31M
8 194 128 $23.80M 373 $9.09M
9 209 135 $132.22M 2,242 $55.95M
10 161 91 $61.64M 828 $29.10M
11 127 79 $3.12M 37 $1.15M
12 77 47 $7.30M 139 $3.83M
13 69 41 $14.25M 1,006 $22.92M
14 53 28 $1.40M 23 $244.20K
15 88 45 $28.55M 245 $7.59M
16 174 88 $6.53M 26 $715.94K
17 116 81 $5.06M 27 $1.22M
18 212 140 $714.05M 6,457 $323.32M
19 243 139 $28.15M 258 $4.88M
20 137 87 $21.73M 293 $6.71M
21 96 48 $2.59M 48 $1.03M
22 90 59 $4.19M 51 $1.91M
23 93 40 $274.25K 0 $0.00
24 59 33 $62.02M 250 $9.88M
25 70 33 $1.39M 10 $211.91K
26 121 77 $130.63M 1,245 $58.77M

By Neighborhood

Map

nh_pop_df <- glpdata::population_nh %>%
  filter(year == 2019,
         sex == "total", 
         race == "total") %>%
  select(neighborhood, population)

nh_df <- bmf_lou %>%
  group_by(neighborhood) %>%
  count() %>%
  ungroup() %>%
  left_join(nh_pop_df, by = "neighborhood") %>%
  transmute(
    Neighborhood = neighborhood,
    Nonprofits = n,
    `Nonprofits per 1000 Residents` = round(n / population * 1000, 1))

nh_df_csi <- bmf_lou %>%
  group_by(neighborhood) %>%
  filter(SUBSECTION == "03", org_type == "Public charity") %>%
  summarize(
    CSIs = n(),
    Revenue = sum(revenue, na.rm = T),
    Employees = sum(employees, na.rm = TRUE),
    Payroll = sum(CYSalariesCompEmpBnftPaidAmt, na.rm = TRUE)) %>%
  filter(!is.na(neighborhood))

nh_df %<>%
  left_join(nh_df_csi, by = c("Neighborhood" = "neighborhood")) %>%
  filter(Neighborhood != "Airport")

make_map(rename(nh_df, neighborhood = Neighborhood), 
         "Nonprofits",
         hover_name = "Nonprofit HQ",
         units = "none")

Table

nh_df %<>%
  rename(`Charitable Social Impact Orgs` = CSIs)

nh_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by Neighborhood",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)
nh_df %>%
  gt() %>%
  tab_header(title = "Nonprofits by Neighborhood",
             subtitle = "") %>%
  fmt_integer(columns = c(Nonprofits, `Charitable Social Impact Orgs`, Employees)) %>%
  fmt_currency(columns = c(Revenue, Payroll), suffix = T) %>%
  tab_spanner(
    label = "Nonprofits",
    columns = vars(Nonprofits)) %>%
  tab_spanner(
    label = "Charitable Social Impact Organizations",
    columns = vars(`Charitable Social Impact Orgs`, Revenue, Employees, Payroll)) %>%
  cols_label(
    Nonprofits = "Number",
    `Charitable Social Impact Orgs` = "Number",
    Revenue = "Total Revenue") %>%
  apply_table_settings()
Nonprofits by Neighborhood
Neighborhood Nonprofits Nonprofits per 1000 Residents Charitable Social Impact Organizations
Number Number Total Revenue Employees Payroll
Algonquin-Park Hill-Park Duvalle 44 2.8 24 $20.54M 286 $13.33M
Buechel-Newburg-Indian Trail 115 3.3 48 $10.66M 132 $3.45M
Butchertown-Clifton-Crescent Hill 147 6.5 101 $106.61M 2,068 $49.78M
California-Parkland 69 7.8 42 $13.03M 140 $4.73M
Chickasaw-Shawnee 86 4.8 59 $6.23M 81 $2.69M
Downtown-Old Louisville-University 505 28.9 343 $1.45B 12,214 $412.98M
Fairdale 17 1.2 12 $6.56M 763 $4.93M
Fern Creek 67 2.4 43 $4.06M 51 $1.91M
Floyd's Fork 228 4.6 126 $8.78M 38 $1.28M
Germantown 77 5.9 46 $16.69M 229 $7.92M
Highlands 151 7.1 88 $25.06M 423 $10.41M
Highview-Okolona 170 2.6 86 $68.00M 486 $27.58M
J-Town 318 5.8 200 $207.56M 3,475 $126.03M
Northeast Jefferson 736 6.2 393 $578.44M 3,858 $216.99M
Phoenix Hill-Smoketown-Shelby Park 163 15.9 119 $749.81M 6,123 $554.55M
Pleasure Ridge Park 88 2.1 55 $7.52M 155 $4.20M
Portland 59 5.8 42 $20.22M 610 $8.46M
Russell 52 5.0 35 $33.93M 647 $12.32M
Shively 116 3.7 66 $13.34M 85 $3.73M
South Central Louisville 79 3.2 42 $93.22M 1,513 $41.44M
South Louisville 154 3.0 85 $4.32M 38 $1.26M
Southeast Louisville 238 4.2 160 $164.87M 1,882 $79.33M
St. Matthews 142 6.8 67 $35.58M 115 $5.78M
Valley Station 60 2.1 31 $1.41M 17 $78.82K

By Zip Code

(The table here is pretty ungainly to look at in my opinion, but you can download it.)

zip_pop_df <- glpdata::population_zip %>%
  filter(year == 2019,
         sex == "total", 
         race == "total") %>%
  select(zip, population_total) %>%
  distinct()

zip_df <- bmf_lou %>%
  group_by(zip) %>%
  count() %>%
  ungroup() %>%
  left_join(zip_pop_df, by = "zip") %>%
  transmute(
    `Zip Code` = zip,
    Nonprofits = n,
    `Nonprofits per 1000 Residents` = 
      round(n / population_total * 1000, 1))

zip_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by Zip Code",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)

# zip_df %>%
#   gt() %>%
#   fmt_integer(columns = c(Nonprofits)) %>%
#   tab_header(title = "Nonprofits by Zip Code",
#              subtitle = "") %>%
#   apply_table_settings()

make_map(rename(zip_df, zip = `Zip Code`), 
         "Nonprofits",
         hover_name = "Nonprofit HQ",
         units = "none")

By CNPE Membership level

cuts <- c(0, 50, 100, 200, 500, 1000, 2000, 5000, Inf) * 1000
  
cut_names <- c("<$50k", "$50k-$100k", "$100k-$200k", "$200k-$500k", "$500k-$1M",
                 "$1M-$2M", "$2M-$5M", "$5M+")

cnpe_levels <- bmf_lou %>%
  filter(SUBSECTION == "03", org_type == "Public charity") %>%
  mutate(revenue_cat = findInterval(revenue, cuts)) %>%
  filter(revenue_cat != 0) %>%
  mutate(revenue_name = factor(cut_names[revenue_cat], ordered = T, levels = cut_names))

cnpe_df <- cnpe_levels %>%
  group_by(revenue_name) %>%
  summarize(
    `Charitable Social Impact Orgs` = n(),
    Revenue = sum(revenue, na.rm = T),
    Employees = sum(employees, na.rm = TRUE),
    Payroll = sum(CYSalariesCompEmpBnftPaidAmt, na.rm = TRUE),
    .groups = "drop") %>%
  rename(`Membership Level` = revenue_name)

cnpe_df %>%
  downloadthis::download_this(
    output_name = "Nonprofits by CNPE Membership Level",
    output_extension = ".csv",
    button_label = "Download data",
    button_type = "warning",
    has_icon = TRUE,
    icon = "fa fa-save",
    csv2 = FALSE)
cnpe_df %>%
  gt() %>%
  tab_header(title = "Nonprofits by CNPE Membership Level",
             subtitle = "") %>%
  fmt_integer(columns = c(`Charitable Social Impact Orgs`, Employees)) %>%
  fmt_currency(columns = c(Revenue, Payroll), suffix = T) %>%
  tab_spanner(
    label = "Charitable Social Impact Organizations",
    columns = vars(`Charitable Social Impact Orgs`, Revenue, Employees, Payroll)) %>%
  cols_label(
    `Charitable Social Impact Orgs` = "Number",
    Revenue = "Total Revenue") %>%
  apply_table_settings()
Nonprofits by CNPE Membership Level
Membership Level Charitable Social Impact Organizations
Number Total Revenue Employees Payroll
<$50k 2,420 $6.20M 43 $1.14M
$50k-$100k 204 $14.98M 95 $2.35M
$100k-$200k 191 $27.29M 150 $4.61M
$200k-$500k 233 $74.37M 559 $16.86M
$500k-$1M 105 $74.79M 745 $25.04M
$1M-$2M 76 $108.69M 1,978 $43.44M
$2M-$5M 83 $265.71M 3,883 $109.29M
$5M+ 91 $3.52B 33,059 $1.54B

Revenue sources

There are a couple of data issues here. The first is with very small organizations, where I need to figure out how to add them in. The 990EZ doesn’t include every revenue category, so I’m having trouble getting all of the numbers to square up. The second data issue is with a program service revenue outlier, which you can clearly see throws off the data for the $2M-$5M category.

rev_source_df <- cnpe_levels %>%
  filter(filing_type == "990") %>%
  group_by(revenue_name) %>%
  summarize(across(c(CYContributionsGrantsAmt,
                     CYProgramServiceRevenueAmt, 
                     GovernmentGrantsAmt, 
                     CYInvestmentIncomeAmt,
                     CYOtherRevenueAmt,
                     revenue),
                          ~ sum(., na.rm=T)),
    .groups = "drop") %>%
  mutate(contributions = CYContributionsGrantsAmt - GovernmentGrantsAmt) %>%
  select(`Membership Level` = revenue_name,
         contributions,
         GovernmentGrantsAmt,
         CYProgramServiceRevenueAmt,
         CYInvestmentIncomeAmt,
         CYOtherRevenueAmt,
         revenue)

Overall

rev_source_df %>%
  gt() %>%
  tab_header(title = "Revenue Sources by CNPE Membership Level",
             subtitle = "") %>%
  fmt_currency(columns = 
                 c(contributions,
                   GovernmentGrantsAmt,
                   CYProgramServiceRevenueAmt,
                   CYInvestmentIncomeAmt,
                   CYOtherRevenueAmt,
                   revenue), suffix = T) %>%
  cols_label(
    contributions = "Contributions and Grants", 
    GovernmentGrantsAmt = "Government Grants", 
    CYProgramServiceRevenueAmt = "Program Service Revenue", 
    CYInvestmentIncomeAmt = "Investment Income", 
    CYOtherRevenueAmt = "Other", 
    revenue = "Total Revenue") %>%
  apply_table_settings()
Revenue Sources by CNPE Membership Level
Membership Level Contributions and Grants Government Grants Program Service Revenue Investment Income Other Total Revenue
<$50k $477.04K $0.00 $250.64K $289.78K $144.72K $938.20K
$50k-$100k $8.21M $784.57K $3.45M −$22.00K $800.84K $14.89M
$100k-$200k $10.53M $1.02M $6.88M $607.85K $1.10M $27.14M
$200k-$500k $32.82M $3.94M $11.89M $1.86M $6.22M $74.37M
$500k-$1M $33.91M $7.20M $18.51M $2.53M $1.46M $73.24M
$1M-$2M $63.49M $19.76M $32.04M $4.85M $5.59M $104.61M
$2M-$5M $93.18M $44.13M $2.02B $56.38M $17.18M $263.59M
$5M+ $519.16M $348.65M $2.15B $147.49M $98.31M $3.52B

Percent of Total

rev_source_df %>%
  mutate(across(contributions:revenue,
                ~. / revenue)) %>%
  gt() %>%
  tab_header(title = "Revenue Sources by CNPE Membership Level",
             subtitle = "") %>%
  fmt_percent(columns = 
                 c(contributions,
                   GovernmentGrantsAmt,
                   CYProgramServiceRevenueAmt,
                   CYInvestmentIncomeAmt,
                   CYOtherRevenueAmt,
                   revenue)) %>%
  cols_label(
    contributions = "Contributions and Grants", 
    GovernmentGrantsAmt = "Government Grants", 
    CYProgramServiceRevenueAmt = "Program Service Revenue", 
    CYInvestmentIncomeAmt = "Investment Income", 
    CYOtherRevenueAmt = "Other", 
    revenue = "Total Revenue") %>%
  apply_table_settings()
Revenue Sources by CNPE Membership Level
Membership Level Contributions and Grants Government Grants Program Service Revenue Investment Income Other Total Revenue
<$50k 50.85% 0.00% 26.71% 30.89% 15.43% 100.00%
$50k-$100k 55.14% 5.27% 23.17% −0.15% 5.38% 100.00%
$100k-$200k 38.82% 3.74% 25.34% 2.24% 4.05% 100.00%
$200k-$500k 44.14% 5.30% 16.00% 2.50% 8.36% 100.00%
$500k-$1M 46.30% 9.83% 25.27% 3.45% 2.00% 100.00%
$1M-$2M 60.70% 18.89% 30.63% 4.64% 5.35% 100.00%
$2M-$5M 35.35% 16.74% 767.41% 21.39% 6.52% 100.00%
$5M+ 14.76% 9.91% 61.05% 4.19% 2.79% 100.00%